###載入套件及資料 ################################################
#Loading multiple .csv files as separate data frames
getwd()
[1] "G:/我的雲端硬碟/交換/MBA/1072/R/midterm"
folder <- "data/"
file_list <- list.files(path = folder, pattern = "*.csv")
#Read in each .csv file
for (i in 1:length(file_list)){
assign(file_list[i],
read.csv(paste(folder,file_list[i],sep=''),stringsAsFactors = F)
)}
#Rename data
geo <- olist_geolocation_dataset.csv
orders <- olist_orders_dataset.csv
cust <- olist_customers_dataset.csv
sellers <- olist_sellers_dataset.csv
products <- olist_products_dataset.csv
orderitems <- olist_order_items_dataset.csv
payments <- olist_order_payments_dataset.csv
nametrans <- product_category_name_translation.csv
reviews <- olist_order_reviews_dataset.csv
closed <- olist_closed_deals_dataset.csv
marketing <- olist_marketing_qualified_leads_dataset.csv
# 地理資料整理
geo$geolocation_lat<-round(geo$geolocation_lat,3)
geo$geolocation_lng<-round(geo$geolocation_lng,3)
selllocation<-geo %>% group_by(geolocation_city) %>% summarise(selllat = max(geolocation_lat),selllng=max(geolocation_lng))
custlocation<-geo %>% group_by(geolocation_city) %>% summarise(custlat = max(geolocation_lat),custlng=max(geolocation_lng))
# 時間資料處理
orders$order_approved_at<-as.Date(orders$order_approved_at,format="%Y-%m-%d %H:%M:%S")
orders$order_purchase_timestamp<-as.Date(orders$order_purchase_timestamp,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_carrier_date<-as.Date(orders$order_delivered_carrier_date,format="%Y-%m-%d %H:%M:%S")
orders$order_delivered_customer_date<-as.Date(orders$order_delivered_customer_date,format="%Y-%m-%d %H:%M:%S")
orders$order_estimated_delivery_date<-as.Date(orders$order_estimated_delivery_date,format="%Y-%m-%d %H:%M:%S")
table(orders$order_status)
approved canceled created delivered invoiced processing
2 625 5 96478 314 301
shipped unavailable
1107 609
orderitems$shipping_limit_date<-as.Date(orderitems$shipping_limit_date,format="%Y-%m-%d %H:%M:%S")
# 把各個資料合併
M_1 <- merge(orderitems,sellers,by.x="seller_id",by.y="seller_id")
M_2 <- merge(orders,cust,by.x="customer_id",by.y="customer_id")
M_3 <- merge(M_2,M_1,by="order_id")
M_4 <- merge(M_3,products,by="product_id")
M_5 <- merge(M_4,payments,by="order_id")
M_6 <- merge(M_5,selllocation,by.x="seller_city",by.y="geolocation_city")
M_7 <- merge(M_6,custlocation,by.x="customer_city",by.y="geolocation_city")
colnames(nametrans) <- c("product_category_name","product_category_name_english")
#計算買賣家之間的距離
dist_list <- list()
for (i in 1:nrow(M_7)) {
dist_list[[i]] <- gdist(lon.1 = M_7$selllng[i],
lat.1 = M_7$selllat[i],
lon.2 = M_7$custlng[i],
lat.2 = M_7$custlat[i],
units="miles")
}
M_7$distbtwn<-as.integer(dist_list)
M_8<-merge(M_7,nametrans,by="product_category_name")
# 置入巴西地理資料
Brazil<-map_data("world") %>% filter(region=="Brazil")
#排除界外外的資料
M_8 = M_8[M_8$selllat <= 5.27438888,]
M_8 = M_8[M_8$custlat <= 5.27438888,]
#M_15_1 <- review of orderitems、M_15_2 <- review of sellers with order informatio
M_15_1<-merge(x = reviews , y = orderitems, by = "order_id")
M_15_2<-merge(x = M_15_1 , y = sellers, by = "seller_id")
M_15_3<-merge(x = M_15_1, y = closed, by = "seller_id")
#畫出賣家所在地
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=selllng,y=selllat,color=seller_state),size=0.2)
#畫出買家所在地
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_8,aes(x=custlng,y=custlat,color=customer_state),size=0.2)
#各產品類別銷售情況及所在地
ggplot() +
geom_bar(data= M_8,aes(product_category_name_english,fill=seller_state),width=1)+ coord_flip()
#賣家所在地數量
ggplot() +
geom_bar(data= M_8,aes(seller_state,fill=seller_state),width=1)
#####1-3.危機:一次性顧客過多
#抓出一次性購買消費者特性
Q <- group_by(olist_customers_dataset.csv, customer_unique_id) %>% summarise(nid=n())
Q <- filter(Q, nid==1)
M_13 <- merge(Q,cust, by="customer_unique_id")
M_14_1 <- merge(M_13, olist_orders_dataset.csv, by="customer_id", all=F)
M_14_2 <- merge(M_14_1,olist_order_items_dataset.csv, by="order_id", all=F)
M_14_3 <- merge(M_14_2, olist_products_dataset.csv, by="product_id", all=F)
colnames(product_category_name_translation.csv) <- c("product_category_name","product_category_name_english")
M_14_4 <- merge(M_14_3,product_category_name_translation.csv, by="product_category_name", all = F)
table(M_14_4$product_category_name_english) %>% sort() ##消費量最多為bed_beath
security_and_services fashion_childrens_clothes
2 7
la_cuisine cds_dvds_musicals
12 14
arts_and_craftmanship fashion_sport
19 28
home_comfort_2 diapers_and_hygiene
28 29
flowers furniture_mattress_and_upholstery
33 34
music fashio_female_clothing
34 40
party_supplies books_imported
42 57
dvds_blu_ray cine_photo
60 68
small_appliances_home_oven_and_coffee tablets_printing_image
75 79
furniture_bedroom costruction_tools_tools
94 99
fashion_male_clothing fashion_underwear_beach
118 118
christmas_supplies construction_tools_safety
146 183
signaling_and_security computers
187 197
art agro_industry_and_commerce
201 203
costruction_tools_garden home_appliances_2
221 224
fashion_shoes fixed_telephony
240 252
food_drink industry_commerce_and_business
255 256
books_technical kitchen_dining_laundry_garden_furniture
261 263
air_conditioning construction_tools_lights
274 292
market_place drinks
297 335
audio home_confort
346 401
furniture_living_room food
455 466
books_general_interest home_construction
529 559
home_appliances small_appliances
633 653
musical_instruments construction_tools_construction
655 868
luggage_accessories consoles_games
1040 1097
office_furniture fashion_bags_accessories
1610 1768
pet_shop stationery
1826 2403
electronics baby
2676 2914
perfumery cool_stuff
3231 3656
toys auto
3901 4041
garden_tools telephony
4097 4314
watches_gifts housewares
5667 6527
computers_accessories furniture_decor
7275 7513
sports_leisure health_beauty
7954 9103
bed_bath_table
10001
onece <- group_by(M_14_4, product_category_name_english) %>% summarise(mean=mean(price), sum=sum(price))##平均消費額最多為computers與總消費最多為health_beauty
#####1-4.產品類別
#尋找出銷售最高的產品
#olist_products_dataset.csv : 32,951 consumers
table(products$product_category_name) %>% sort(decreasing = TRUE) ##消費最多是 cama_mesa_banho 3029
cama_mesa_banho
3029
esporte_lazer
2867
moveis_decoracao
2657
beleza_saude
2444
utilidades_domesticas
2335
automotivo
1900
informatica_acessorios
1639
brinquedos
1411
relogios_presentes
1329
telefonia
1134
bebes
919
perfumaria
868
fashion_bolsas_e_acessorios
849
papelaria
849
cool_stuff
789
ferramentas_jardim
753
pet_shop
719
610
eletronicos
517
construcao_ferramentas_construcao
400
eletrodomesticos
370
malas_acessorios
349
consoles_games
317
moveis_escritorio
309
instrumentos_musicais
289
eletroportateis
231
casa_construcao
225
livros_interesse_geral
216
fashion_calcados
173
moveis_sala
156
climatizacao
124
livros_tecnicos
123
telefonia_fixa
116
casa_conforto
111
alimentos_bebidas
104
market_place
104
fashion_roupa_masculina
95
moveis_cozinha_area_de_servico_jantar_e_jardim
94
sinalizacao_e_seguranca
93
construcao_ferramentas_seguranca
91
eletrodomesticos_2
90
construcao_ferramentas_jardim
88
alimentos
82
bebidas
81
construcao_ferramentas_iluminacao
78
agro_industria_e_comercio
74
industria_comercio_e_negocios
68
artigos_de_natal
65
audio
58
artes
55
fashion_underwear_e_moda_praia
53
dvds_blu_ray
48
moveis_quarto
45
construcao_ferramentas_ferramentas
39
livros_importados
31
portateis_casa_forno_e_cafe
31
pcs
30
cine_foto
28
fashion_roupa_feminina
27
musica
27
artigos_de_festas
26
artes_e_artesanato
19
fashion_esporte
19
flores
14
fraldas_higiene
12
la_cuisine
10
moveis_colchao_e_estofado
10
portateis_cozinha_e_preparadores_de_alimentos
10
tablets_impressao_imagem
9
casa_conforto_2
5
fashion_roupa_infanto_juvenil
5
pc_gamer
3
seguros_e_servicos
2
cds_dvds_musicais
1
#互動式圓餅圖 : 不同種類商品的平均評分&其銷量佔比(商品種類後面的數字是其平均分數)
Product2 <- group_by(M_15_3,business_segment)%>%summarize(score2=round(mean(review_score),2),percent=n()/nrow(M_15_3)*100)
plot_ly(Product2, labels = paste(Product2$business_segment,Product2$score2),values = Product2$percent, type = 'pie') %>%
layout(title = '各種類商品銷售百分比',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
#####1-5.不同時段下的銷售量
#整理消費者購買月份
ts = as.POSIXct(as.character(olist_orders_dataset.csv$order_purchase_timestamp) , format="%Y-%m-%d %T")
ts.bym <- cut(ts, breaks = "month")
dfts <- data.frame(ts,ts.bym)
table(dfts$ts.bym)
2016-09-01 2016-10-01 2016-11-01 2016-12-01 2017-01-01 2017-02-01 2017-03-01
4 324 0 1 800 1780 2682
2017-04-01 2017-05-01 2017-06-01 2017-07-01 2017-08-01 2017-09-01 2017-10-01
2404 3700 3245 4026 4331 4285 4631
2017-11-01 2017-12-01 2018-01-01 2018-02-01 2018-03-01 2018-04-01 2018-05-01
7544 5673 7269 6728 7211 6939 6873
2018-06-01 2018-07-01 2018-08-01 2018-09-01 2018-10-01
6167 6292 6512 16 4
ggplot(dfts, aes(ts.bym),las=1)+ geom_bar() ##最多銷售月份為2018-1-1
#每日尖峰時段
ts.byH <- format(ts,format="%H") %>% data.frame()
ggplot(ts.byH ,aes(.))+ geom_bar()
table(ts.byH)
ts.byH
00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
2394 1170 510 272 206 188 502 1231 2967 4785 6177 6578 5995 6518 6569 6454
16 17 18 19 20 21 22 23
6675 6150 5769 5982 6193 6217 5816 4123
##商品種類-月份
M_10 <- merge(olist_orders_dataset.csv, olist_order_items_dataset.csv, by.x = "order_id")
M_11 <- merge(M_10, olist_products_dataset.csv, by.x = "product_id")
M_12 <- merge(M_11, product_category_name_translation.csv, by.x= "product_category_name", by.y="ï..product_category_name")
D <- select(M_12,product_category_name_english, order_purchase_timestamp)
DD = as.POSIXct(as.character(D$order_purchase_timestamp) , format="%Y-%m-%d %T")
DD.bym <- format(DD,format="%m")
D$bym <- DD.bym
E <- filter(D, product_category_name_english=="bed_bath_table"|product_category_name_english=="health_beauty"|product_category_name_english=="sports_leisure"|product_category_name_english=="furniture_decor"|product_category_name_english=="computers_accessories"|product_category_name_english=="housewares" )
summary(D$product_category_name_english) %>% sort()
##table(D$product_category_name_english, D$bym)
ggplot(E ,aes(product_category_name_english, fill=bym))+ geom_bar() ##抓出銷售量最高的六種商品做每個月分的比例圖
ggplot(D, aes(product_category_name_english, fill=bym, width=1))+ geom_bar()+coord_flip() ##每種商品每個月分的銷售狀況
##按照小時區分
DD.byh <- format(DD,format="%H")
D$byh <- DD.byh
##table(D$product_category_name_english, D$byh)
ggplot(D, aes(product_category_name_english, fill=byh, width=1))+ geom_bar()+coord_flip()##按照時間點的銷售量
#按照小時區分heatmap版本
#Rearranging data from D data frame
deaf = select(D, product_category_name_english,byh)
shook = group_by(deaf, byh, product_category_name_english) %>% summarise(n=n())
jump = spread(shook, byh, n)
jump[is.na(jump)] <- 0
#Converting product category column into rowname
jump = column_to_rownames(jump, var = "product_category_name_english")
jump = as.matrix(jump)
#Using plot_ly () to draw the interactive heatmap
plot_ly(x=colnames(jump), y=rownames(jump), z = jump, type = "heatmap")
#####2-1.每月的訂單數,新進的買、賣家數
seller_TOP100 <-
M_8 %>% group_by(seller_id,product_category_name_english) %>% summarise(
mount = sum(price),
num = n()) %>% arrange(desc(num)) %>% head(100)
seller_TOP100_2 <- merge(seller_TOP100,closed[,c(1,2,6,8,9,12)],by="seller_id",all.x = T)
marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
M_9 <- merge(M_8,marketingdata,by = "seller_id",all.x = T)
#7 畫出Olist的成長概況:每月的訂單數、新進的買賣家數
str(M_9)
'data.frame': 113782 obs. of 45 variables:
$ seller_id : chr "0015a82c2db000af6aaaf3ae2ecb0532" "0015a82c2db000af6aaaf3ae2ecb0532" "0015a82c2db000af6aaaf3ae2ecb0532" "001cca7ae9ae17fb1caed9dfb1094831" ...
$ product_category_name : chr "eletroportateis" "eletroportateis" "eletroportateis" "ferramentas_jardim" ...
$ customer_city : chr "entre rios de minas" "paracatu" "umuarama" "concordia" ...
$ seller_city : chr "santo andre" "santo andre" "santo andre" "cariacica" ...
$ order_id : chr "7f39ba4c9052be115350065d07583cac" "9dc8d1a6f16f1b89874c29c9d8d30447" "d455a8cb295653b55abda06d434ab492" "226e16e7eb6dcefe180d80127468de8b" ...
$ product_id : chr "a2ff5a97bf95719e38ea2e3b4105bce8" "a2ff5a97bf95719e38ea2e3b4105bce8" "a2ff5a97bf95719e38ea2e3b4105bce8" "08574b074924071f4e201e151b152b4e" ...
$ customer_id : chr "d7fc82cbeafea77bd0a8fbbf6296e387" "d9442164acf4b03109425633efaa0cfc" "944b72539d7e1f7f7fc6e46639ef1fe3" "1063de5cc98f570e53eccb32723746e2" ...
$ order_status : chr "delivered" "delivered" "delivered" "delivered" ...
$ order_purchase_timestamp : Date, format: "2017-10-18" "2017-10-12" ...
$ order_approved_at : Date, format: "2017-10-18" "2017-10-12" ...
$ order_delivered_carrier_date : Date, format: "2017-10-20" "2017-10-17" ...
$ order_delivered_customer_date: Date, format: "2017-10-27" "2017-10-24" ...
$ order_estimated_delivery_date: Date, format: "2017-11-09" "2017-11-06" ...
$ customer_unique_id : chr "9de5797cddb92598755a0f76383ddbbb" "9915eb9f74b6c11aaf04833f65b00e93" "3c7e305796add66698959fc7ad176f6b" "20ae7f5cb408302fe02d95d78c8d4f01" ...
$ customer_zip_code_prefix : int 35490 38600 87502 89700 23954 8161 20780 31525 84045 13348 ...
$ customer_state : chr "MG" "MG" "PR" "SC" ...
$ order_item_id : int 1 1 1 1 4 1 1 1 1 2 ...
$ shipping_limit_date : Date, format: "2017-10-24" "2017-10-18" ...
$ price : num 895 895 895 99 99.9 99 99 110 99.5 89 ...
$ freight_value : num 21 21 21 46 14.9 ...
$ seller_zip_code_prefix : int 9080 9080 9080 29156 29156 29156 29156 29156 29156 29156 ...
$ seller_state : chr "SP" "SP" "SP" "ES" ...
$ product_name_lenght : int 40 40 40 36 34 36 36 39 33 36 ...
$ product_description_lenght : int 849 849 849 450 511 450 450 500 509 450 ...
$ product_photos_qty : int 2 2 2 1 4 1 1 1 1 1 ...
$ product_weight_g : int 11800 11800 11800 9000 8875 9000 9000 8325 9100 9000 ...
$ product_length_cm : int 40 40 40 42 40 42 42 43 42 42 ...
$ product_height_cm : int 43 43 43 12 14 12 12 14 13 12 ...
$ product_width_cm : int 36 36 36 39 43 39 39 33 39 39 ...
$ payment_sequential : int 1 1 1 1 1 1 1 1 1 1 ...
$ payment_type : chr "credit_card" "credit_card" "credit_card" "credit_card" ...
$ payment_installments : int 8 4 10 1 4 4 1 2 1 6 ...
$ payment_value : num 916 916 916 290 574 ...
$ selllat : num -7.22 -7.22 -7.22 -20.13 -20.13 ...
$ selllng : num -36.6 -36.6 -36.6 -40.3 -40.3 ...
$ custlat : num -20.7 -17.1 -23.7 -27.2 -22.9 ...
$ custlng : num -44.1 -43.4 -53.2 -51.9 -44.2 ...
$ distbtwn : int 1049 819 1580 880 313 449 250 245 709 482 ...
$ product_category_name_english: chr "small_appliances" "small_appliances" "small_appliances" "garden_tools" ...
$ mql_id : chr NA NA NA NA ...
$ business_segment : chr NA NA NA NA ...
$ lead_behaviour_profile : chr NA NA NA NA ...
$ has_company : chr NA NA NA NA ...
$ business_type : chr NA NA NA NA ...
$ origin : chr NA NA NA NA ...
class(M_9$time)
[1] "NULL"
unique(M_9$time_group)
NULL
M_9[,9]=as.POSIXct(M_9[,9], format="%Y-%m-%d")
M_9$time_group <- format(M_9$order_purchase_timestamp,"%Y%m") %>% as.numeric()
M_9$time <- format(M_9$order_purchase_timestamp,"%Y%m") %>% paste0(.,"28") %>% as.Date("%Y%m%d")
time_group <- unique(M_9$time_group) %>% sort()
num_seller_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$seller_id[M_9$time_group==time_group[i]],unique(M_9$seller_id[M_9$time_group<time_group[i]])) %>% length)
num_seller = c(0,num_seller_1[-24])
num_customer_1 = sapply(1:length(time_group), # start by 2, so i-1 = 1
function(i) setdiff(M_9$customer_unique_id[M_9$time_group==time_group[i]],unique(M_9$customer_unique_id[M_9$time_group<time_group[i]])) %>% length)
num_customer = c(0,num_customer_1[-24])
num_order <- M_9 %>% group_by(time) %>% summarise(
num_order = length(unique(order_id))
)
plot <- cbind(time=(sort(unique(M_9$time))),num_seller,num_customer,num_order[,2])
#ggplotly_try 嘗試直接顯示4個軸度
plotly = ggplot(plot, mapping =aes(x=as.factor(time), y=num_order, col=num_customer)) +
geom_point(mapping = aes(size=num_seller)) +
geom_text(mapping =aes(label=time), size=0.5) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) +
ylab("Order") +
xlab("Time")
plotly
ggplotly(plotly)
#####2-2.時間序列的折線圖:每月的訂單數,新進的買、賣家數
#畫出時間序列的折線圖
data.ts<-zoo(plot,plot[,"time"])
plot <- data.ts[1:24,-1]
dygraph(plot,main = "Olist新進買賣家及訂單數走勢") %>%
dySeries("num_customer", label = "新進顧客數")%>%
dySeries("num_order", label = "訂單數") %>%
dySeries("num_seller", axis = 'y2', label = "新進賣家數") %>%
dyOptions( axisLineColor="orange",
gridLineColor="indianred" , fillGraph = F,fillAlpha = 0.2,
drawGrid = TRUE,drawPoints=TRUE, pointSize = 1 ) %>%
dyAxis("x", label = " 日期 ", drawGrid = F) %>%
dyAxis("y", label = " ", drawGrid = T) %>%
dyHighlight(highlightCircleSize = 3,
highlightSeriesBackgroundAlpha = 0.2) %>%
dyOptions(colors = RColorBrewer::brewer.pal(3, 'Dark2')) %>%
dyRangeSelector(height = 1)
#####2-3.產品類別評分及銷售情形
#不同商品種類的平均評價分數排名 : air conditioning和handcrafted的平均評價最好
Product <- group_by(M_15_3,business_segment)%>%summarize(score=mean(review_score),quantity=n(),price=mean(price))
arrange(Product,score)
sort(table(M_15_3$business_segment))
party games_consoles
3 4
air_conditioning gifts
5 8
handcrafted music_instruments
11 25
baby food_drink
42 49
phone_mobile fashion_accessories
56 60
stationery small_appliances
60 69
home_office_furniture food_supplement
82 100
books toys
106 121
computers bags_backpacks
125 140
home_appliances car_accessories
141 164
sports_leisure bed_bath_table
179 188
audio_video_electronics pet
275 288
construction_tools_house_garden home_decor
319 453
household_utilities watches
562 594
health_beauty
821
#不同類別的平均評價 互動式散點圖
g=qplot(x=Product$score,
y=Product$quantity,
geom="point",
main = "不同商品類別的評價、銷量、價格",
xlab="平均分數",
ylab="賣出數量",
color= Product$business_segment,
size = Product$price
)
ggplotly(g)
#####2-4.泡泡圖:產品類別評分及銷售情形 -> 發現評分表現下滑
#####3-1.州(state)銷售額 #####3-1.州(state)評分 #####3-1.城市(city)銷售額 #####3-1.城市(city)評分 ################以下孫嘉力部分
#score: the average score of each city
score <- M_15_2 %>%
group_by(seller_city) %>%
summarise(
score = mean(review_score),
pricesum = sum(price)+sum(freight_value)
)
#geolocation: private dataframe, with state,lon and lat of each city
geolocation <- olist_geolocation_dataset.csv
geolocation['seller_city'] = geo['geolocation_city']
geolocation['state'] = geo["geolocation_state"]
geolocation['geolocation_zip_code_prefix'] = NULL
geolocation <- geo %>%
group_by(geolocation_city) %>%
summarise(
lat = mean(geolocation_lat),
lng = mean(geolocation_lng),
seller_city = geolocation_city[1],
state = geolocation_state[1]
)
#draw review of each state and use addPolygon to fig
city_score <- merge(score,geolocation,by="seller_city")
state_score <- city_score %>%
group_by(state) %>%
summarise(
score = mean(score),
pricesum = sum(pricesum)
)
#import Brazill json data
states <- geojsonio::geojson_read("C:\\Users\\User\\Desktop\\Group3\\midterm\\Brazil.json", what = "sp")
#color setting
bins_score <- c(0,1.0,2.0,3.0,4.0,5.0)
bins_money <- c(0,500,5000,10000,50000,100000,1000000,Inf)
pal_score <- colorBin("Blues", domain = state_score$score, bins = bins_score)
pal_money <- colorBin("Reds" ,domain = state_score$pricesum, bins = bins_money)
#add score Info of State
labels <- sprintf(
"<strong>%s</strong><br/> score: %g <br/> total price: %g ",
state_score$state,
state_score$score,
state_score$pricesum
) %>% lapply(htmltools::HTML)
citylabels <- sprintf(
"<strong>city: %s</strong><br/> score: %g <br/> total price: %g ",
city_score$seller_city,
city_score$score,
city_score$pricesum
)%>% lapply(htmltools::HTML)
#form a map of Brazil
map_Brazil <- leaflet(states) %>%
setView(lat=-22.074022, lng=-48.74026, zoom = 4) %>%
addProviderTiles("MapBox",group = "map", options = providerTileOptions(
id = "mapbox.light",
accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN')))
#draw city inform
map_Brazil <- map_Brazil %>%
addProviderTiles("MapBox",group = "city", options = providerTileOptions(
id = "mapbox.light",
accessToken = Sys.getenv('MAPBOX_ACCESS_TOKEN'))) %>%
addMarkers(
lng = city_score$lng,
lat = city_score$lat,
label = citylabels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 4px"),
textsize = "15px",
direction = "auto"),
clusterOptions = markerClusterOptions(),
options = popupOptions(closeButton = TRUE),
group = "city"
)
#draw state labels
map_Brazil <- map_Brazil %>%
#add state score information
addPolygons(
fillColor = ~pal_score(state_score$score),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list
("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "state_score") %>%
#add state price information
addPolygons(
fillColor = ~pal_money(state_score$pricesum),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto"),
group = "state_price"
) %>%
#add legends
addLegend(pal = pal_money, values = ~state_score$pricesum ,opacity =0.7,title = "price legend",
position = "bottomleft",group = "state_price" ) %>%
addLegend(pal = pal_score, values = ~state_score$score, opacity = 0.7, title = "score legend",
position = "bottomleft",group = "state_score") %>%
# add layer Control
addLayersControl(
overlayGroups = c("city","state_score","state_price")
) %>% hideGroup("state_price") %>%
# add mini map
addProviderTiles(providers$Esri.WorldStreetMap) %>%
addMiniMap(
tiles = providers$Esri.WorldStreetMap,
toggleDisplay = TRUE)
#結果
map_Brazil
#####4-1.解釋新變數
#8 找對評分的重要變數
#####王
M_9 %>% colnames() #2:4,9:11,15,17,19:20,22,24,25,32,38
[1] "seller_id" "product_category_name"
[3] "customer_city" "seller_city"
[5] "order_id" "product_id"
[7] "customer_id" "order_status"
[9] "order_purchase_timestamp" "order_approved_at"
[11] "order_delivered_carrier_date" "order_delivered_customer_date"
[13] "order_estimated_delivery_date" "customer_unique_id"
[15] "customer_zip_code_prefix" "customer_state"
[17] "order_item_id" "shipping_limit_date"
[19] "price" "freight_value"
[21] "seller_zip_code_prefix" "seller_state"
[23] "product_name_lenght" "product_description_lenght"
[25] "product_photos_qty" "product_weight_g"
[27] "product_length_cm" "product_height_cm"
[29] "product_width_cm" "payment_sequential"
[31] "payment_type" "payment_installments"
[33] "payment_value" "selllat"
[35] "selllng" "custlat"
[37] "custlng" "distbtwn"
[39] "product_category_name_english" "mql_id"
[41] "business_segment" "lead_behaviour_profile"
[43] "has_company" "business_type"
[45] "origin" "time_group"
[47] "time"
score <- M_9[,c(5,10:12,16,17,19,20,22,24:29,32,38,46)] %>% group_by(order_id) %>% summarise(
ship13 = mean(order_delivered_customer_date-order_approved_at),
ship12 = mean(order_delivered_carrier_date-order_approved_at),
ship23 = mean(order_delivered_customer_date-order_delivered_carrier_date),
ship_ratio = mean(freight_value/price),
price = sum(price),
AVEvolume = sum(product_length_cm*product_height_cm*product_width_cm)/max(order_item_id),
description = sum(product_description_lenght)/max(order_item_id),
photo_num = sum(product_photos_qty)/max(order_item_id),
pay_installment = sum(payment_installments)/max(order_item_id),
dist = sum(distbtwn)/max(order_item_id),
customer_state = customer_state[1],
seller_state = seller_state[1],
time_group = time_group[1]
)
reviews2 <- reviews %>% group_by(order_id) %>% summarise(
score = mean(review_score)
)
# 建置對評分的回歸模型
score_lm<-merge(score,reviews2,by="order_id")
score_lm[,2:4] <- sapply(score_lm[,2:4], as.numeric)
score_lm <- score_lm[complete.cases(score_lm),]
#回歸結果
model <- lm(score ~ . ,data = score_lm[,c(2:15)])
summary(model)
Call:
lm(formula = score ~ ., data = score_lm[, c(2:15)])
Residuals:
Min 1Q Median 3Q Max
-4.142 -0.416 0.511 0.758 8.714
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 146.676094324 16.673163569 8.80 < 0.0000000000000002 ***
ship13 -0.049347295 0.000507651 -97.21 < 0.0000000000000002 ***
ship12 -0.005992261 0.001248453 -4.80 0.000001591111096 ***
ship23 NA NA NA NA
ship_ratio -0.102628272 0.013671220 -7.51 0.000000000000061 ***
price -0.000180322 0.000019886 -9.07 < 0.0000000000000002 ***
AVEvolume 0.000000596 0.000000145 4.12 0.000038470316906 ***
description 0.000017458 0.000005592 3.12 0.00180 **
photo_num 0.000393133 0.002037934 0.19 0.84703
pay_installment -0.005247201 0.001555012 -3.37 0.00074 ***
dist 0.000028964 0.000010916 2.65 0.00797 **
customer_stateAL -0.030491776 0.151932969 -0.20 0.84094
customer_stateAM 0.479224704 0.172213086 2.78 0.00539 **
customer_stateAP 0.478096075 0.203018981 2.35 0.01853 *
customer_stateBA -0.200615374 0.140677011 -1.43 0.15385
customer_stateCE -0.082217496 0.143113136 -0.57 0.56564
customer_stateDF -0.312117403 0.141656775 -2.20 0.02757 *
customer_stateES -0.228977322 0.141859044 -1.61 0.10651
customer_stateGO -0.211966355 0.141937848 -1.49 0.13534
customer_stateMA -0.183800394 0.146374141 -1.26 0.20923
customer_stateMG -0.301321084 0.139791712 -2.16 0.03113 *
customer_stateMS -0.139833637 0.146508268 -0.95 0.33986
customer_stateMT -0.047076882 0.145234659 -0.32 0.74583
customer_statePA -0.005394138 0.144658238 -0.04 0.97025
customer_statePB -0.016210304 0.149039725 -0.11 0.91339
customer_statePE -0.089017302 0.142290724 -0.63 0.53158
customer_statePI -0.108850982 0.149998715 -0.73 0.46804
customer_statePR -0.255461151 0.140426033 -1.82 0.06889 .
customer_stateRJ -0.367436846 0.139746382 -2.63 0.00856 **
customer_stateRN 0.024686391 0.150197592 0.16 0.86945
customer_stateRO 0.073445526 0.160002503 0.46 0.64622
customer_stateRR 0.344487911 0.240709613 1.43 0.15239
customer_stateRS -0.154763067 0.140198252 -1.10 0.26964
customer_stateSC -0.216784107 0.140776966 -1.54 0.12359
customer_stateSE -0.114351276 0.154268955 -0.74 0.45855
customer_stateSP -0.390899799 0.139637944 -2.80 0.00512 **
customer_stateTO -0.032079331 0.157522517 -0.20 0.83863
seller_stateBA 0.310311297 0.701754252 0.44 0.65835
seller_stateCE 0.548047705 0.712232351 0.77 0.44161
seller_stateDF 0.154739715 0.701112259 0.22 0.82532
seller_stateES 0.215570007 0.703302483 0.31 0.75922
seller_stateGO 0.440912837 0.702134681 0.63 0.53003
seller_stateMA 0.308390531 0.702456053 0.44 0.66065
seller_stateMG 0.303042716 0.699923113 0.43 0.66504
seller_stateMS 0.479169855 0.722690379 0.66 0.50731
seller_stateMT 0.349251758 0.707548811 0.49 0.62158
seller_statePA 0.667659362 0.820354156 0.81 0.41572
seller_statePB 0.269953993 0.733877879 0.37 0.71299
seller_statePE 0.293853538 0.702610819 0.42 0.67578
seller_statePI 0.478320207 0.789359648 0.61 0.54454
seller_statePR 0.304523535 0.699920481 0.44 0.66350
seller_stateRJ 0.288714064 0.700050066 0.41 0.68003
seller_stateRN 0.183959834 0.720481757 0.26 0.79847
seller_stateRO 0.048482815 0.856778615 0.06 0.95487
seller_stateRS 0.320738178 0.700340490 0.46 0.64697
seller_stateSC 0.310799400 0.700071270 0.44 0.65708
seller_stateSE -0.364078942 0.820432032 -0.44 0.65721
seller_stateSP 0.182444172 0.699815389 0.26 0.79432
time_group -0.000702833 0.000082589 -8.51 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.21 on 93393 degrees of freedom
Multiple R-squared: 0.125, Adjusted R-squared: 0.124
F-statistic: 233 on 57 and 93393 DF, p-value: <0.0000000000000002
#####4-2.相關係數
#相關係數視覺化
# Libraries
library(GGally)
# Check correlation between variables
cor(score_lm[,c(2:11,14,15)])
ship13 ship12 ship23 ship_ratio price
ship13 1.0000000 0.3864676 0.9283265 0.0766294 0.0481980
ship12 0.3864676 1.0000000 0.0158873 -0.0212227 0.0668187
ship23 0.9283265 0.0158873 1.0000000 0.0916289 0.0253181
ship_ratio 0.0766294 -0.0212227 0.0916289 1.0000000 -0.2723965
price 0.0481980 0.0668187 0.0253181 -0.2723965 1.0000000
AVEvolume 0.0621467 0.1181375 0.0197542 -0.0806547 0.3094009
description 0.0050792 0.0057164 0.0032022 -0.1140959 0.2676434
photo_num -0.0224812 -0.0352931 -0.0101459 -0.0470165 0.1166358
pay_installment 0.0742081 0.0578195 0.0571433 -0.1857385 0.3161935
dist 0.2670480 0.0080955 0.2862451 0.1404717 0.1125701
time_group -0.0597468 -0.0806666 -0.0322563 0.0093886 -0.0040814
score -0.3361920 -0.1544938 -0.3021936 -0.0270974 -0.0311427
AVEvolume description photo_num pay_installment dist
ship13 0.062147 0.0050792 -0.022481 0.074208 0.2670480
ship12 0.118138 0.0057164 -0.035293 0.057819 0.0080955
ship23 0.019754 0.0032022 -0.010146 0.057143 0.2862451
ship_ratio -0.080655 -0.1140959 -0.047017 -0.185738 0.1404717
price 0.309401 0.2676434 0.116636 0.316194 0.1125701
AVEvolume 1.000000 0.1158442 0.109449 0.171008 0.0597506
description 0.115844 1.0000000 0.223843 0.074115 0.1392289
photo_num 0.109449 0.2238430 1.000000 0.037662 0.1102882
pay_installment 0.171008 0.0741153 0.037662 1.000000 0.0874214
dist 0.059751 0.1392289 0.110288 0.087421 1.0000000
time_group -0.039378 0.0221935 -0.013546 -0.060023 -0.0359676
score -0.018784 0.0111701 0.012524 -0.030453 -0.0380816
time_group score
ship13 -0.0597468 -0.3361920
ship12 -0.0806666 -0.1544938
ship23 -0.0322563 -0.3021936
ship_ratio 0.0093886 -0.0270974
price -0.0040814 -0.0311427
AVEvolume -0.0393776 -0.0187840
description 0.0221935 0.0111701
photo_num -0.0135455 0.0125242
pay_installment -0.0600233 -0.0304530
dist -0.0359676 -0.0380816
time_group 1.0000000 -0.0078071
score -0.0078071 1.0000000
# Check correlations (as scatterplots), distribution and print corrleation coefficient
#ggpairs(score_lm[,c(2:11,14,15)])
# Nice visualization of correlations
ggcorr(score_lm[,c(2:11,14,15)], method = c("everything", "pearson"))
seller_TOP100 <-
M_8 %>% group_by(seller_id,product_category_name_english) %>% summarise(
mount = sum(price),
num = n()) %>% arrange(desc(num)) %>% head(100)
seller_TOP100_2 <- merge(seller_TOP100,closed[,c(1,2,6,8,9,12)],by="seller_id",all.x = T)
marketingdata <- merge(closed[,c(1,2,6,8,9,12)],marketing[,c(1,4)],by = "mql_id")
M_9 <- merge(M_8,marketingdata,by = "seller_id",all.x = T)
#賣家的性格分類
#分析
A_1 <- M_9 %>% group_by(lead_behaviour_profile,business_segment) %>% summarise(
n_business = n(),
mount = sum(price),
every_mount = mount/n_business
) %>% arrange(desc(lead_behaviour_profile))
table(A_1$lead_behaviour_profile)
cat cat, wolf eagle shark wolf
23 26 1 22 7 15
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_9,aes(x=custlng,y=custlat,color=lead_behaviour_profile),size=0.2)
ggplot() +
geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="gray")+
geom_point(data= M_9[M_9$lead_behaviour_profile=="wolf",],aes(x=custlng,y=custlat,color=lead_behaviour_profile),size=0.2)
Warning: Removed 108718 rows containing missing values (geom_point).
#貨物與體積的關係
v <- mutate(products, product_volume= product_length_cm*product_height_cm*product_width_cm)
ggplot(v, aes(product_photos_qty, product_name_lenght))+ geom_point()
Warning: Removed 610 rows containing missing values (geom_point).
ggplot(v, aes(product_volume))+ geom_bar()
Warning: Removed 2 rows containing non-finite values (stat_count).
table(v$product_photos_qty) ##1 photo
1 2 3 4 5 6 7 8 9 10 11 12 13
16489 6263 3860 2428 1484 968 343 192 105 95 46 35 9
14 15 17 18 19 20
5 8 7 2 1 1
table(v$product_category_name, v$product_photos_qty)
1 2 3 4 5 6
0 0 0 0 0 0
agro_industria_e_comercio 41 10 7 4 2 8
alimentos 56 6 7 3 4 4
alimentos_bebidas 66 21 5 4 0 2
artes 32 12 3 0 1 3
artes_e_artesanato 6 2 4 2 4 0
artigos_de_festas 19 4 0 0 0 2
artigos_de_natal 40 4 5 3 2 1
audio 31 10 7 6 2 2
automotivo 747 320 300 165 196 145
bebes 382 225 123 99 35 31
bebidas 60 8 9 1 0 1
beleza_saude 1732 315 176 114 60 25
brinquedos 537 358 212 137 81 42
cama_mesa_banho 2283 507 128 57 27 21
casa_conforto 49 17 43 0 2 0
casa_conforto_2 4 1 0 0 0 0
casa_construcao 70 46 13 32 14 21
cds_dvds_musicais 0 1 0 0 0 0
cine_foto 14 6 4 0 3 0
climatizacao 67 19 7 16 12 2
consoles_games 138 68 46 36 19 4
construcao_ferramentas_construcao 202 66 41 30 18 31
construcao_ferramentas_ferramentas 18 11 6 0 1 2
construcao_ferramentas_iluminacao 44 12 7 7 2 4
construcao_ferramentas_jardim 58 6 1 2 11 10
construcao_ferramentas_seguranca 40 17 17 11 4 2
cool_stuff 358 166 110 81 33 21
dvds_blu_ray 40 3 2 1 0 2
eletrodomesticos 259 52 41 9 2 0
eletrodomesticos_2 42 18 13 7 2 3
eletronicos 282 92 84 28 18 7
eletroportateis 110 46 33 22 10 7
esporte_lazer 1576 492 269 214 158 79
fashion_bolsas_e_acessorios 299 115 118 131 110 48
fashion_calcados 3 15 20 75 19 15
fashion_esporte 3 13 0 1 0 2
fashion_roupa_feminina 6 13 1 4 0 3
fashion_roupa_infanto_juvenil 1 0 2 1 1 0
fashion_roupa_masculina 26 57 7 4 0 1
fashion_underwear_e_moda_praia 14 12 15 1 2 4
ferramentas_jardim 339 140 99 76 44 26
flores 12 1 1 0 0 0
fraldas_higiene 7 2 1 1 0 0
industria_comercio_e_negocios 36 12 9 6 3 1
informatica_acessorios 848 342 231 120 58 25
instrumentos_musicais 113 52 39 23 29 12
la_cuisine 9 1 0 0 0 0
livros_importados 26 3 0 0 0 1
livros_interesse_geral 155 22 11 8 8 7
livros_tecnicos 117 4 0 1 0 1
malas_acessorios 152 54 49 29 24 14
market_place 38 20 20 13 5 4
moveis_colchao_e_estofado 4 4 0 2 0 0
moveis_cozinha_area_de_servico_jantar_e_jardim 33 20 9 11 13 2
moveis_decoracao 1119 650 403 209 98 50
moveis_escritorio 260 27 9 4 1 7
moveis_quarto 19 16 4 3 1 2
moveis_sala 63 56 20 10 6 1
musica 10 8 6 2 1 0
papelaria 295 182 157 90 53 26
pc_gamer 0 0 1 1 0 1
pcs 12 5 0 9 2 1
perfumaria 587 172 57 25 10 9
pet_shop 298 179 103 51 28 28
portateis_casa_forno_e_cafe 18 2 4 2 1 2
portateis_cozinha_e_preparadores_de_alimentos 6 1 0 2 1 0
relogios_presentes 541 370 205 101 55 28
seguros_e_servicos 1 0 0 1 0 0
sinalizacao_e_seguranca 39 21 14 6 5 1
tablets_impressao_imagem 4 0 2 0 2 0
telefonia 368 301 176 95 49 110
telefonia_fixa 28 27 32 12 8 1
utilidades_domesticas 1177 403 312 207 124 53
7 8 9 10 11 12
0 0 0 0 0 0
agro_industria_e_comercio 1 1 0 0 0 0
alimentos 2 0 0 0 0 0
alimentos_bebidas 4 1 0 1 0 0
artes 3 0 0 1 0 0
artes_e_artesanato 1 0 0 0 0 0
artigos_de_festas 1 0 0 0 0 0
artigos_de_natal 1 3 6 0 0 0
audio 0 0 0 0 0 0
automotivo 11 10 4 2 0 0
bebes 13 1 4 5 0 0
bebidas 0 0 2 0 0 0
beleza_saude 10 7 2 3 0 0
brinquedos 17 10 7 7 1 1
cama_mesa_banho 1 4 1 0 0 0
casa_conforto 0 0 0 0 0 0
casa_conforto_2 0 0 0 0 0 0
casa_construcao 22 7 0 0 0 0
cds_dvds_musicais 0 0 0 0 0 0
cine_foto 1 0 0 0 0 0
climatizacao 1 0 0 0 0 0
consoles_games 2 2 2 0 0 0
construcao_ferramentas_construcao 6 4 1 1 0 0
construcao_ferramentas_ferramentas 0 1 0 0 0 0
construcao_ferramentas_iluminacao 2 0 0 0 0 0
construcao_ferramentas_jardim 0 0 0 0 0 0
construcao_ferramentas_seguranca 0 0 0 0 0 0
cool_stuff 11 3 1 4 0 0
dvds_blu_ray 0 0 0 0 0 0
eletrodomesticos 1 1 0 5 0 0
eletrodomesticos_2 4 0 1 0 0 0
eletronicos 5 0 0 1 0 0
eletroportateis 2 0 0 0 0 0
esporte_lazer 27 34 7 10 1 0
fashion_bolsas_e_acessorios 17 4 0 0 0 0
fashion_calcados 0 1 0 0 0 25
fashion_esporte 0 0 0 0 0 0
fashion_roupa_feminina 0 0 0 0 0 0
fashion_roupa_infanto_juvenil 0 0 0 0 0 0
fashion_roupa_masculina 0 0 0 0 0 0
fashion_underwear_e_moda_praia 0 0 2 0 2 1
ferramentas_jardim 15 6 5 2 1 0
flores 0 0 0 0 0 0
fraldas_higiene 0 1 0 0 0 0
industria_comercio_e_negocios 1 0 0 0 0 0
informatica_acessorios 8 6 1 0 0 0
instrumentos_musicais 3 7 7 4 0 0
la_cuisine 0 0 0 0 0 0
livros_importados 1 0 0 0 0 0
livros_interesse_geral 2 1 1 1 0 0
livros_tecnicos 0 0 0 0 0 0
malas_acessorios 10 6 5 6 0 0
market_place 0 1 3 0 0 0
moveis_colchao_e_estofado 0 0 0 0 0 0
moveis_cozinha_area_de_servico_jantar_e_jardim 4 2 0 0 0 0
moveis_decoracao 42 24 19 9 33 0
moveis_escritorio 1 0 0 0 0 0
moveis_quarto 0 0 0 0 0 0
moveis_sala 0 0 0 0 0 0
musica 0 0 0 0 0 0
papelaria 19 8 7 12 0 0
pc_gamer 0 0 0 0 0 0
pcs 1 0 0 0 0 0
perfumaria 6 1 0 0 0 0
pet_shop 7 5 1 4 0 2
portateis_casa_forno_e_cafe 1 1 0 0 0 0
portateis_cozinha_e_preparadores_de_alimentos 0 0 0 0 0 0
relogios_presentes 13 10 5 1 0 0
seguros_e_servicos 0 0 0 0 0 0
sinalizacao_e_seguranca 4 2 1 0 0 0
tablets_impressao_imagem 1 0 0 0 0 0
telefonia 8 4 3 5 5 4
telefonia_fixa 0 2 3 1 2 0
utilidades_domesticas 30 11 4 10 1 2
13 14 15 17 18 19
0 0 0 0 0 0
agro_industria_e_comercio 0 0 0 0 0 0
alimentos 0 0 0 0 0 0
alimentos_bebidas 0 0 0 0 0 0
artes 0 0 0 0 0 0
artes_e_artesanato 0 0 0 0 0 0
artigos_de_festas 0 0 0 0 0 0
artigos_de_natal 0 0 0 0 0 0
audio 0 0 0 0 0 0
automotivo 0 0 0 0 0 0
bebes 0 0 0 0 0 1
bebidas 0 0 0 0 0 0
beleza_saude 0 0 0 0 0 0
brinquedos 0 0 0 0 0 0
cama_mesa_banho 0 0 0 0 0 0
casa_conforto 0 0 0 0 0 0
casa_conforto_2 0 0 0 0 0 0
casa_construcao 0 0 0 0 0 0
cds_dvds_musicais 0 0 0 0 0 0
cine_foto 0 0 0 0 0 0
climatizacao 0 0 0 0 0 0
consoles_games 0 0 0 0 0 0
construcao_ferramentas_construcao 0 0 0 0 0 0
construcao_ferramentas_ferramentas 0 0 0 0 0 0
construcao_ferramentas_iluminacao 0 0 0 0 0 0
construcao_ferramentas_jardim 0 0 0 0 0 0
construcao_ferramentas_seguranca 0 0 0 0 0 0
cool_stuff 1 0 0 0 0 0
dvds_blu_ray 0 0 0 0 0 0
eletrodomesticos 0 0 0 0 0 0
eletrodomesticos_2 0 0 0 0 0 0
eletronicos 0 0 0 0 0 0
eletroportateis 0 1 0 0 0 0
esporte_lazer 0 0 0 0 0 0
fashion_bolsas_e_acessorios 1 2 4 0 0 0
fashion_calcados 0 0 0 0 0 0
fashion_esporte 0 0 0 0 0 0
fashion_roupa_feminina 0 0 0 0 0 0
fashion_roupa_infanto_juvenil 0 0 0 0 0 0
fashion_roupa_masculina 0 0 0 0 0 0
fashion_underwear_e_moda_praia 0 0 0 0 0 0
ferramentas_jardim 0 0 0 0 0 0
flores 0 0 0 0 0 0
fraldas_higiene 0 0 0 0 0 0
industria_comercio_e_negocios 0 0 0 0 0 0
informatica_acessorios 0 0 0 0 0 0
instrumentos_musicais 0 0 0 0 0 0
la_cuisine 0 0 0 0 0 0
livros_importados 0 0 0 0 0 0
livros_interesse_geral 0 0 0 0 0 0
livros_tecnicos 0 0 0 0 0 0
malas_acessorios 0 0 0 0 0 0
market_place 0 0 0 0 0 0
moveis_colchao_e_estofado 0 0 0 0 0 0
moveis_cozinha_area_de_servico_jantar_e_jardim 0 0 0 0 0 0
moveis_decoracao 0 0 1 0 0 0
moveis_escritorio 0 0 0 0 0 0
moveis_quarto 0 0 0 0 0 0
moveis_sala 0 0 0 0 0 0
musica 0 0 0 0 0 0
papelaria 0 0 0 0 0 0
pc_gamer 0 0 0 0 0 0
pcs 0 0 0 0 0 0
perfumaria 0 1 0 0 0 0
pet_shop 0 1 3 7 2 0
portateis_casa_forno_e_cafe 0 0 0 0 0 0
portateis_cozinha_e_preparadores_de_alimentos 0 0 0 0 0 0
relogios_presentes 0 0 0 0 0 0
seguros_e_servicos 0 0 0 0 0 0
sinalizacao_e_seguranca 0 0 0 0 0 0
tablets_impressao_imagem 0 0 0 0 0 0
telefonia 6 0 0 0 0 0
telefonia_fixa 0 0 0 0 0 0
utilidades_domesticas 1 0 0 0 0 0
20
0
agro_industria_e_comercio 0
alimentos 0
alimentos_bebidas 0
artes 0
artes_e_artesanato 0
artigos_de_festas 0
artigos_de_natal 0
audio 0
automotivo 0
bebes 0
bebidas 0
beleza_saude 0
brinquedos 1
cama_mesa_banho 0
casa_conforto 0
casa_conforto_2 0
casa_construcao 0
cds_dvds_musicais 0
cine_foto 0
climatizacao 0
consoles_games 0
construcao_ferramentas_construcao 0
construcao_ferramentas_ferramentas 0
construcao_ferramentas_iluminacao 0
construcao_ferramentas_jardim 0
construcao_ferramentas_seguranca 0
cool_stuff 0
dvds_blu_ray 0
eletrodomesticos 0
eletrodomesticos_2 0
eletronicos 0
eletroportateis 0
esporte_lazer 0
fashion_bolsas_e_acessorios 0
fashion_calcados 0
fashion_esporte 0
fashion_roupa_feminina 0
fashion_roupa_infanto_juvenil 0
fashion_roupa_masculina 0
fashion_underwear_e_moda_praia 0
ferramentas_jardim 0
flores 0
fraldas_higiene 0
industria_comercio_e_negocios 0
informatica_acessorios 0
instrumentos_musicais 0
la_cuisine 0
livros_importados 0
livros_interesse_geral 0
livros_tecnicos 0
malas_acessorios 0
market_place 0
moveis_colchao_e_estofado 0
moveis_cozinha_area_de_servico_jantar_e_jardim 0
moveis_decoracao 0
moveis_escritorio 0
moveis_quarto 0
moveis_sala 0
musica 0
papelaria 0
pc_gamer 0
pcs 0
perfumaria 0
pet_shop 0
portateis_casa_forno_e_cafe 0
portateis_cozinha_e_preparadores_de_alimentos 0
relogios_presentes 0
seguros_e_servicos 0
sinalizacao_e_seguranca 0
tablets_impressao_imagem 0
telefonia 0
telefonia_fixa 0
utilidades_domesticas 0
ggplot(v,aes(product_name_lenght))+ geom_bar() ##左尾分布 集中在接近60
Warning: Removed 610 rows containing non-finite values (stat_count).
ggplot(v,aes(product_description_lenght))+ geom_bar() ##右尾分布 集中在0~1000
Warning: Removed 610 rows containing non-finite values (stat_count).
ggplot(v,aes(product_volume))+ geom_histogram(binwidth = 1500)
Warning: Removed 2 rows containing non-finite values (stat_bin).
by_name <- group_by(v,product_category_name) %>% summarise(avg = mean(product_volume)) %>% as.data.frame() ##貨物平均體積
by_name <- by_name[-1,]
product_category_name_translation.csv$product_category_name<- product_category_name_translation.csv$product_category_name
m <- merge(by_name,product_category_name_translation.csv,all = F)
m <- m[,-3] ##貨物種類-平均體積
#三年熱賣前10大商品
product_year=data.frame(year=format(M_8$order_purchase_timestamp,"%Y"),product=M_8$product_category_name_english)
product2016=subset(product_year,year==2016)
table16=table(product2016$product)
product2017=subset(product_year,year==2017)
table17=table(product2017$product)
product2018=subset(product_year,year==2018)
table18=table(product2018$product)
table16=sort(table16,decreasing=TRUE) %>% head(10) #2016年銷售前10名
table17=sort(table17,decreasing=TRUE) %>% head(10) #2017年銷售前10名
pie(table16[1:5],main="2016")
pie(table17[1:5],main="2017")
pie(table18[1:5],main="2018")
#運費佔價格比例與評分的關聯
#運費佔價格比例與評分的關聯,圖形上看起來,運費佔比越高,消費者對商品評價越低。但運費佔比與評價的相關係數很小
M_15_4<-M_15_2[,-5]
M_15_5=mutate(M_15_4,fdividep=freight_value/price)
ggplot(M_15_5, aes(fdividep, review_score))+
geom_smooth(se = FALSE)
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
model=lm(M_15_5$fdivide~M_15_5$review_score)
summary(model)
Call:
lm(formula = M_15_5$fdivide ~ M_15_5$review_score)
Residuals:
Min 1Q Median 3Q Max
-0.349 -0.186 -0.089 0.073 25.886
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.358286 0.003151 113.7 <0.0000000000000002 ***
M_15_5$review_score -0.009300 0.000741 -12.6 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.349 on 113320 degrees of freedom
Multiple R-squared: 0.00139, Adjusted R-squared: 0.00138
F-statistic: 158 on 1 and 113320 DF, p-value: <0.0000000000000002
cor(M_15_5$fdivide,M_15_5$review_score)
[1] -0.037256
model=lm(M_15_5$freight_value~M_15_5$review_score) #運費高低與評價的相關係數也很小
summary(model)
Call:
lm(formula = M_15_5$freight_value ~ M_15_5$review_score)
Residuals:
Min 1Q Median 3Q Max
-21.2 -6.8 -3.6 1.2 390.1
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6094 0.1423 151.8 <0.0000000000000002 ***
M_15_5$review_score -0.4059 0.0335 -12.1 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 15.8 on 113320 degrees of freedom
Multiple R-squared: 0.0013, Adjusted R-squared: 0.00129
F-statistic: 147 on 1 and 113320 DF, p-value: <0.0000000000000002
cor(M_15_5$freight_value,M_15_5$review_score)
[1] -0.036006
summarise(filter(M_15_5, fdividep < 0.1 ),n())/112650
summarise(filter(filter(M_15_5, fdividep < 0.2), fdividep >= 0.1),n())/112650
summarise(filter(filter(M_15_5, fdividep < 0.3), fdividep >= 0.2),n())/112650
summarise(filter(filter(M_15_5, fdividep < 0.4), fdividep >= 0.3),n())/112650
summarise(filter(M_15_5, fdividep >= 0.4),n())/112650
hist(M_15_5$fdividep,las=2,freq=T,xlab="運費佔比",ylab="數量",main="運費佔比的數量")
#商品價格和評分的關係
#商品價格和評分高低的關係。從圖型來看,大約超過3000元的商品,越貴評分越高越貴的商品評分越高。但商品價格和評分高低相關係數相當低,可推測兩者並無太大關連
plot(M_15_1$price,M_15_1$review_score,main="", ylab="評分", xlab="價錢")
ggplot(M_15_1, aes(price, review_score))+
geom_smooth(se = FALSE)
`geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
model=lm(M_15_1$price~M_15_1$review_score)
summary(model)
Call:
lm(formula = M_15_1$price ~ M_15_1$review_score)
Residuals:
Min 1Q Median 3Q Max
-121 -81 -46 14 6615
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 122.860 1.654 74.30 <0.0000000000000002 ***
M_15_1$review_score -0.592 0.389 -1.52 0.13
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 183 on 113320 degrees of freedom
Multiple R-squared: 2.05e-05, Adjusted R-squared: 1.17e-05
F-statistic: 2.32 on 1 and 113320 DF, p-value: 0.128
cor(M_15_1$price,M_15_1$review_score)
[1] -0.0045252
#不同月分的商品獲得5分評價的比例
#不同月分的商品獲得5分評價的比例 : 可見7、8月商品評價較高
t=as.Date(reviews$review_creation_date)
score5<-filter(reviews, review_score ==5 )
ts5=as.Date(score5$review_creation_date)
table(format(ts5,'%m')) %>% sort()
09 10 11 01 02 12 04 03 07 06 05 08
2522 2734 2813 3672 4291 4294 5072 5183 5747 6118 6623 8351
ratio5<-table(format(ts5,'%m'))/table(format(t,'%m'))
ratio5
01 02 03 04 05 06 07 08 09 10
0.56675 0.56520 0.49995 0.53860 0.58887 0.59857 0.62447 0.61536 0.59425 0.59037
11 12
0.57105 0.52999